import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=False)
import plotly.express as px
# Load the Big Mart sales dataset from a public Google Sheet (CSV export).
df = pd.read_csv("https://docs.google.com/spreadsheets/d/17_svn8lKuMPh4sl01a8Fca656yLRwbkYD2osTgmrvi8/export?format=csv")
ml_df = df.copy() # Copy of the DataFrame for Machine Learning (keeps raw NaNs for the pipeline imputers)
df.head()
| Item_Identifier | Item_Weight | Item_Fat_Content | Item_Visibility | Item_Type | Item_MRP | Outlet_Identifier | Outlet_Establishment_Year | Outlet_Size | Outlet_Location_Type | Outlet_Type | Item_Outlet_Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FDA15 | 9.30 | Low Fat | 0.016047 | Dairy | 249.8092 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 3735.1380 |
| 1 | DRC01 | 5.92 | Regular | 0.019278 | Soft Drinks | 48.2692 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 443.4228 |
| 2 | FDN15 | 17.50 | Low Fat | 0.016760 | Meat | 141.6180 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 2097.2700 |
| 3 | FDX07 | 19.20 | Regular | 0.000000 | Fruits and Vegetables | 182.0950 | OUT010 | 1998 | NaN | Tier 3 | Grocery Store | 732.3800 |
| 4 | NCD19 | 8.93 | Low Fat | 0.000000 | Household | 53.8614 | OUT013 | 1987 | High | Tier 3 | Supermarket Type1 | 994.7052 |
# Install the orca binary plus its xvfb/GTK dependencies so Plotly can export
# static images inside Colab (notebook shell magics, not Python).
!wget https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get install xvfb libgtk2.0-0 libgconf-2-4
## Thanks to Greg Hogg https://youtu.be/qNF1HqBvpGE for this solution to Plotly export issues
--2022-07-02 21:45:25-- https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage Resolving github.com (github.com)... 140.82.113.3 Connecting to github.com (github.com)|140.82.113.3|:443... connected. HTTP request sent, awaiting response... 302 Found Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/99037241/9dc3a580-286a-11e9-8a21-4312b7c8a512?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20220702%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220702T214526Z&X-Amz-Expires=300&X-Amz-Signature=fcd6c45e4b8e958366e802f81de392ef9ce06fe5eba0f1e9d2f21a7fbdc85654&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=99037241&response-content-disposition=attachment%3B%20filename%3Dorca-1.2.1-x86_64.AppImage&response-content-type=application%2Foctet-stream [following] --2022-07-02 21:45:26-- https://objects.githubusercontent.com/github-production-release-asset-2e65be/99037241/9dc3a580-286a-11e9-8a21-4312b7c8a512?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20220702%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220702T214526Z&X-Amz-Expires=300&X-Amz-Signature=fcd6c45e4b8e958366e802f81de392ef9ce06fe5eba0f1e9d2f21a7fbdc85654&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=99037241&response-content-disposition=attachment%3B%20filename%3Dorca-1.2.1-x86_64.AppImage&response-content-type=application%2Foctet-stream Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ... Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443... connected. HTTP request sent, awaiting response... 200 OK Length: 51607939 (49M) [application/octet-stream] Saving to: ‘/usr/local/bin/orca’ /usr/local/bin/orca 100%[===================>] 49.22M 30.9MB/s in 1.6s 2022-07-02 21:45:27 (30.9 MB/s) - ‘/usr/local/bin/orca’ saved [51607939/51607939] Reading package lists... 
Done Building dependency tree Reading state information... Done libgtk2.0-0 is already the newest version (2.24.32-1ubuntu1). libgconf-2-4 is already the newest version (3.2.6-4ubuntu1). xvfb is already the newest version (2:1.19.6-1ubuntu4.10). The following package was automatically installed and is no longer required: libnvidia-common-460 Use 'apt autoremove' to remove it. 0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
# Column dtypes and non-null counts: Item_Weight and Outlet_Size contain NaNs.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8523 entries, 0 to 8522 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Item_Identifier 8523 non-null object 1 Item_Weight 7060 non-null float64 2 Item_Fat_Content 8523 non-null object 3 Item_Visibility 8523 non-null float64 4 Item_Type 8523 non-null object 5 Item_MRP 8523 non-null float64 6 Outlet_Identifier 8523 non-null object 7 Outlet_Establishment_Year 8523 non-null int64 8 Outlet_Size 6113 non-null object 9 Outlet_Location_Type 8523 non-null object 10 Outlet_Type 8523 non-null object 11 Item_Outlet_Sales 8523 non-null float64 dtypes: float64(4), int64(1), object(7) memory usage: 799.2+ KB
# Count fully duplicated rows (there are none).
df.duplicated().sum()
0
# Missing values per column: 1463 Item_Weight, 2410 Outlet_Size.
df.isna().sum()
Item_Identifier 0 Item_Weight 1463 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Identifier 0 Outlet_Establishment_Year 0 Outlet_Size 2410 Outlet_Location_Type 0 Outlet_Type 0 Item_Outlet_Sales 0 dtype: int64
# Impute missing Item_Weight values with the mean weight of the row's Item_Type.
# Vectorized: map each row's type to its type-average and use that series to
# fill NaNs — replaces the original per-type Python loop with a single pass.
avg_weights_by_type = df.groupby("Item_Type")["Item_Weight"].mean()
df["Item_Weight"] = df["Item_Weight"].fillna(df["Item_Type"].map(avg_weights_by_type))
df.isna().sum()
Item_Identifier 0 Item_Weight 0 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Identifier 0 Outlet_Establishment_Year 0 Outlet_Size 2410 Outlet_Location_Type 0 Outlet_Type 0 Item_Outlet_Sales 0 dtype: int64
# Inspect the raw Outlet_Size categories (Medium/High/Small plus NaN).
df["Outlet_Size"].unique()
array(['Medium', nan, 'High', 'Small'], dtype=object)
# Per-outlet non-null counts: three outlets (OUT010/OUT017/OUT045) have no
# Outlet_Size recorded at all, so the size cannot be inferred per outlet.
df.groupby("Outlet_Identifier")[["Outlet_Size","Outlet_Location_Type"]].count()
| Outlet_Size | Outlet_Location_Type | |
|---|---|---|
| Outlet_Identifier | ||
| OUT010 | 0 | 555 |
| OUT013 | 932 | 932 |
| OUT017 | 0 | 926 |
| OUT018 | 928 | 928 |
| OUT019 | 528 | 528 |
| OUT027 | 935 | 935 |
| OUT035 | 930 | 930 |
| OUT045 | 0 | 929 |
| OUT046 | 930 | 930 |
| OUT049 | 930 | 930 |
# Outlet_Size is entirely missing for three outlets, so label it explicitly as
# its own "Unknown" category. Column reassignment replaces inplace=True on a
# column selection, which triggers a chained-assignment FutureWarning under
# pandas Copy-on-Write and stops working in pandas 3.0.
df["Outlet_Size"] = df["Outlet_Size"].fillna("Unknown")
df.isna().sum()
Item_Identifier 0 Item_Weight 0 Item_Fat_Content 0 Item_Visibility 0 Item_Type 0 Item_MRP 0 Outlet_Identifier 0 Outlet_Establishment_Year 0 Outlet_Size 0 Outlet_Location_Type 0 Outlet_Type 0 Item_Outlet_Sales 0 dtype: int64
# Rename the "High" size label to "Large" so the category set reads
# Small / Medium / Large / Unknown in plots.
df["Outlet_Size"] = df["Outlet_Size"].replace("High", "Large")
df["Outlet_Size"].unique()
array(['Medium', 'Unknown', 'Large', 'Small'], dtype=object)
# Cardinality of every column.
df.nunique()
Item_Identifier 1559 Item_Weight 431 Item_Fat_Content 5 Item_Visibility 7880 Item_Type 16 Item_MRP 5938 Outlet_Identifier 10 Outlet_Establishment_Year 9 Outlet_Size 4 Outlet_Location_Type 3 Outlet_Type 4 Item_Outlet_Sales 3493 dtype: int64
# Item_Fat_Content has five spellings of what are really two categories.
df["Item_Fat_Content"].unique()
array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)
# Normalize the inconsistent fat-content spellings down to the two real
# categories in one dict-based replace. Reassignment replaces inplace=True on
# a column selection (chained-assignment FutureWarning under pandas
# Copy-on-Write, removed in pandas 3.0).
df["Item_Fat_Content"] = df["Item_Fat_Content"].replace(
    {"low fat": "Low Fat", "LF": "Low Fat", "reg": "Regular"})
df.nunique()
Item_Identifier 1559 Item_Weight 431 Item_Fat_Content 2 Item_Visibility 7880 Item_Type 16 Item_MRP 5938 Outlet_Identifier 10 Outlet_Establishment_Year 9 Outlet_Size 4 Outlet_Location_Type 3 Outlet_Type 4 Item_Outlet_Sales 3493 dtype: int64
# The 16 distinct item categories.
df["Item_Type"].unique()
array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)
# Summary statistics for the numeric columns.
df.describe()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Item_Outlet_Sales | |
|---|---|---|---|---|---|
| count | 8523.000000 | 8523.000000 | 8523.000000 | 8523.000000 | 8523.000000 |
| mean | 12.857890 | 0.066132 | 140.992782 | 1997.831867 | 2181.288914 |
| std | 4.232804 | 0.051598 | 62.275067 | 8.371760 | 1706.499616 |
| min | 4.555000 | 0.000000 | 31.290000 | 1985.000000 | 33.290000 |
| 25% | 9.310000 | 0.026989 | 93.826500 | 1987.000000 | 834.247400 |
| 50% | 12.867061 | 0.053931 | 143.012800 | 1999.000000 | 1794.331000 |
| 75% | 16.000000 | 0.094585 | 185.643700 | 2004.000000 | 3101.296400 |
| max | 21.350000 | 0.328391 | 266.888400 | 2009.000000 | 13086.964800 |
# Box plot of sales revenue to eyeball the outlier situation.
fig, ax = plt.subplots()
ax.boxplot([df["Item_Outlet_Sales"]], labels=["Item Sales"],
           notch=True, patch_artist=True)
plt.show()

# Flag upper outliers with the 1.5 * IQR rule (only the high side matters here:
# the minimum sale sits well above the lower fence).
sales = df["Item_Outlet_Sales"]
q1, q3 = sales.quantile([0.25, 0.75])
iqr = q3 - q1
upper_fence = q3 + 1.5 * iqr
outliers = df[sales > upper_fence]
outliers
| Item_Identifier | Item_Weight | Item_Fat_Content | Item_Visibility | Item_Type | Item_MRP | Outlet_Identifier | Outlet_Establishment_Year | Outlet_Size | Outlet_Location_Type | Outlet_Type | Item_Outlet_Sales | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 43 | FDC02 | 21.350000 | Low Fat | 0.069103 | Canned | 259.9278 | OUT018 | 2009 | Medium | Tier 3 | Supermarket Type2 | 6768.5228 |
| 130 | FDY25 | 12.305705 | Low Fat | 0.033810 | Canned | 180.5976 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 7968.2944 |
| 132 | NCR53 | 13.142314 | Low Fat | 0.144338 | Health and Hygiene | 224.4404 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 6976.2524 |
| 145 | FDP16 | 18.600000 | Low Fat | 0.039356 | Frozen Foods | 246.3802 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 7370.4060 |
| 203 | FDI24 | 12.277108 | Low Fat | 0.078362 | Baking Goods | 177.9370 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 6704.6060 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8245 | FDU55 | 16.200000 | Low Fat | 0.035967 | Fruits and Vegetables | 260.3278 | OUT049 | 1999 | Medium | Tier 1 | Supermarket Type1 | 7549.5062 |
| 8329 | NCQ06 | 13.384736 | Low Fat | 0.041622 | Household | 253.6014 | OUT027 | 1985 | Medium | Tier 3 | Supermarket Type3 | 6630.0364 |
| 8350 | NCE18 | 10.000000 | Low Fat | 0.021421 | Household | 248.3750 | OUT035 | 2004 | Small | Tier 2 | Supermarket Type1 | 7240.5750 |
| 8447 | FDS26 | 20.350000 | Low Fat | 0.089975 | Dairy | 261.6594 | OUT017 | 2007 | Unknown | Tier 2 | Supermarket Type1 | 7588.1226 |
| 8510 | FDN58 | 13.800000 | Regular | 0.056862 | Snack Foods | 231.5984 | OUT035 | 2004 | Small | Tier 2 | Supermarket Type1 | 7182.6504 |
186 rows × 12 columns
# Remove the 186 flagged outlier rows from the EDA frame (ml_df keeps them).
df.drop(outliers.index,axis=0 ,inplace=True)
df.describe()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Item_Outlet_Sales | |
|---|---|---|---|---|---|
| count | 8337.000000 | 8337.000000 | 8337.000000 | 8337.000000 | 8337.000000 |
| mean | 12.856296 | 0.066358 | 139.161087 | 1997.968094 | 2059.362844 |
| std | 4.256788 | 0.051838 | 61.553351 | 8.309941 | 1506.099754 |
| min | 4.555000 | 0.000000 | 31.290000 | 1985.000000 | 33.290000 |
| 25% | 9.300000 | 0.027028 | 93.046200 | 1987.000000 | 810.944400 |
| 50% | 12.867061 | 0.053939 | 141.215400 | 1999.000000 | 1747.059200 |
| 75% | 16.100000 | 0.095299 | 183.695000 | 2004.000000 | 2998.097400 |
| max | 21.350000 | 0.328391 | 266.888400 | 2009.000000 | 6478.234000 |
# Correlation heatmap of the numeric columns. numeric_only=True is required:
# since pandas 2.0, DataFrame.corr() raises on object (string) columns instead
# of silently dropping them.
sns.heatmap(df.corr(numeric_only=True), cmap="Blues", annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fbc73c94b50>
# Scatter of sales vs. visibility with a first-degree least-squares trend line.
vis = df["Item_Visibility"]
revenue = df["Item_Outlet_Sales"]
slope, intercept = np.polyfit(vis, revenue, 1)
fig, ax = plt.subplots()
ax.scatter(vis, revenue, label="datapoint")
ax.plot(vis, slope * vis + intercept, "r-", label="Best Fit Line")
ax.legend()
ax.set_title("Total Sales Revenue by Visibility")
ax.set_xlabel("Visibility")  # fixed typo: was "Visiblity"
ax.set_ylabel("Sales Revenue")
plt.show()
# Overlaid sales histograms, one per outlet size.
histogram = df.groupby("Outlet_Size")["Item_Outlet_Sales"].hist(alpha=0.35, legend=True)
# Compare sales distributions across outlet sizes with notched box plots,
# ordered Small -> Medium -> Large -> Unknown.
size_order = ["Small", "Medium", "Large", "Unknown"]
samples = [df.loc[df["Outlet_Size"] == size, "Item_Outlet_Sales"] for size in size_order]
fig, ax = plt.subplots()
bps = ax.boxplot(samples, labels=size_order, notch=True,
                 patch_artist=True)
plt.show()
/usr/local/lib/python3.7/dist-packages/matplotlib/cbook/__init__.py:1376: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
# Per-size sales distribution summary (counts, quartiles, etc.).
df.groupby("Outlet_Size")["Item_Outlet_Sales"].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Outlet_Size | ||||||||
| Large | 917.0 | 2217.100205 | 1398.677744 | 73.2380 | 1057.95620 | 2014.7108 | 3103.95960 | 6474.2392 |
| Medium | 2676.0 | 2458.690424 | 1530.464302 | 69.2432 | 1238.22155 | 2135.8864 | 3448.84400 | 6478.2340 |
| Small | 2362.0 | 1852.606258 | 1483.026265 | 33.9558 | 593.22780 | 1517.3582 | 2775.55375 | 6474.2392 |
| Unknown | 2382.0 | 1755.044328 | 1432.707736 | 33.2900 | 549.28500 | 1417.4882 | 2631.24160 | 6471.5760 |
# Stacked total-sales bars per outlet, colored by outlet size.
size_order = ["Small",
              "Medium",
              "Large",
              "Unknown"]
axis_labels = {"Outlet_Size": "Outlet Size",
               "Outlet_Identifier": "Outlet ID",
               "Item_Outlet_Sales": "Total Sales"}
fig = px.histogram(df, x="Outlet_Identifier", y="Item_Outlet_Sales",
                   title="Total Sales Per Outlet",
                   hover_data=["Outlet_Size"],
                   color="Outlet_Size",
                   labels=axis_labels,
                   category_orders={"Outlet_Size": size_order})
py.iplot(fig)
# fig.show("png")
# Stacked total-sales bars per item category, colored by outlet type.
type_order = ["Supermarket Type1",
              "Supermarket Type2",
              "Supermarket Type3",
              "Grocery Store"]
axis_labels = {"Outlet_Type": "Outlet Type",
               "Item_Type": "Item Type",
               "Item_Outlet_Sales": "Total Sales"}
fig = px.histogram(df, x="Item_Type", y="Item_Outlet_Sales",
                   title="Total Sales Per Item Type",
                   hover_data=["Outlet_Type"],
                   color="Outlet_Type",
                   labels=axis_labels,
                   category_orders={"Outlet_Type": type_order})
py.iplot(fig)
# fig.show("png")
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
# Target variable for all the regression models below.
y = ml_df["Item_Outlet_Sales"]
numeric_features = ["Item_Weight",
"Item_Visibility",
"Item_MRP",
"Outlet_Establishment_Year"]
ordinal_encoded_features = ["Outlet_Size",
"Outlet_Type",
"Outlet_Identifier",
"Item_Type",
"Item_Fat_Content"]
# Include all features to be used in the model; Item_Identifier is kept only
# so WeightsByIdImputer can look weights up, and is dropped by the transformer.
X = ml_df[numeric_features + ordinal_encoded_features + ["Item_Identifier"]]
X.head()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Outlet_Size | Outlet_Type | Outlet_Identifier | Item_Type | Item_Fat_Content | Item_Identifier | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9.30 | 0.016047 | 249.8092 | 1999 | Medium | Supermarket Type1 | OUT049 | Dairy | Low Fat | FDA15 |
| 1 | 5.92 | 0.019278 | 48.2692 | 2009 | Medium | Supermarket Type2 | OUT018 | Soft Drinks | Regular | DRC01 |
| 2 | 17.50 | 0.016760 | 141.6180 | 1999 | Medium | Supermarket Type1 | OUT049 | Meat | Low Fat | FDN15 |
| 3 | 19.20 | 0.000000 | 182.0950 | 1998 | NaN | Grocery Store | OUT010 | Fruits and Vegetables | Regular | FDX07 |
| 4 | 8.93 | 0.000000 | 53.8614 | 1987 | High | Supermarket Type1 | OUT013 | Household | Low Fat | NCD19 |
# 75/25 train/validation split (sklearn default test_size) with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)
class WeightsByIdImputer(BaseEstimator, TransformerMixin):
    """Impute missing Item_Weight values from other rows with the same
    Item_Identifier.

    Items appear in several outlets, so when one row is missing its weight the
    modal weight recorded for that identifier elsewhere is a near-exact fill.
    """

    def __init__(self):
        # Learned in fit(): {Item_Identifier: modal Item_Weight}.
        self.known_weights = {}

    def fit(self, X, y=None):
        """Build the identifier -> weight lookup from the training frame."""
        # Reset so refitting the same instance does not keep stale entries.
        self.known_weights = {}
        # `item_id` instead of `id`, which shadowed the builtin.
        for item_id in X["Item_Identifier"].unique():
            id_weights = X.loc[X["Item_Identifier"] == item_id, "Item_Weight"]
            if id_weights.notna().sum() > 0:
                self.known_weights[item_id] = id_weights.mode()[0]
        return self

    def transform(self, X, y=None):
        """Return a copy of X with known per-identifier weights filled in.

        Rows whose identifier was never seen with a weight are left NaN for the
        downstream WeightsByTypeImputer to handle.
        """
        X_copy = X.copy()
        # Only visit identifiers that actually have a missing weight.
        for item_id in X_copy.loc[X_copy["Item_Weight"].isna(), "Item_Identifier"].unique():
            if item_id in self.known_weights:
                mask = X_copy["Item_Identifier"] == item_id
                X_copy.loc[mask, ["Item_Weight"]] = \
                    X_copy.loc[mask, ["Item_Weight"]].fillna(self.known_weights[item_id])
        return X_copy
class WeightsByTypeImputer(BaseEstimator, TransformerMixin):
    """Fallback Item_Weight imputer: fill remaining NaNs with the mean weight
    of the row's Item_Type, or the overall mean for types unseen at fit time.
    """

    def __init__(self):
        # Learned in fit(): {Item_Type: mean weight} plus a "default" key.
        self.averages = {}

    def fit(self, X, y=None):
        """Record per-type mean weights and a global default mean."""
        # Reset so refitting the same instance does not keep stale entries.
        self.averages = {"default": X["Item_Weight"].mean()}
        for item_type in X["Item_Type"].unique():
            self.averages[item_type] = \
                X.loc[X["Item_Type"] == item_type, "Item_Weight"].mean()
        return self

    def transform(self, X, y=None):
        """Return a copy of X with NaN weights filled by type average."""
        X_copy = X.copy()
        for item_type in X_copy["Item_Type"].unique():
            # dict.get collapses the original if/else over known vs unseen types.
            fill = self.averages.get(item_type, self.averages["default"])
            mask = X_copy["Item_Type"] == item_type
            X_copy.loc[mask, ["Item_Weight"]] = \
                X_copy.loc[mask, ["Item_Weight"]].fillna(fill)
        return X_copy
# Preprocessing: scale numeric columns; impute-then-ordinal-encode categoricals.
scaler = StandardScaler()
# Categorical NaNs (Outlet_Size) become an explicit "Unknown" category,
# mirroring the EDA treatment above.
missing_imputer = SimpleImputer(strategy = "constant", fill_value = "Unknown")
# NOTE(review): OrdinalEncoder raises on categories unseen during fit — fine
# for this fixed set of outlets/types, but confirm before feeding new data.
ord_enc = OrdinalEncoder()
cat_pipe = make_pipeline(missing_imputer, ord_enc)
# Item_Identifier is not listed, so the column transformer drops it (default
# remainder="drop") after the weight imputers have used it.
transformer = make_column_transformer((scaler,numeric_features),
(cat_pipe, ordinal_encoded_features))
know_weight_imp = WeightsByIdImputer()
weight_type_imp = WeightsByTypeImputer()
# Id-based imputation first, type-average as fallback, then encode/scale.
pipe = make_pipeline(know_weight_imp,weight_type_imp,transformer)
pipe.fit(X_train,y_train)
# Rebuild DataFrames: column order matches the transformer's output
# (numeric features first, then the ordinal-encoded ones).
X_train = pd.DataFrame(pipe.transform(X_train), columns = numeric_features + ordinal_encoded_features)
X_valid = pd.DataFrame(pipe.transform(X_valid), columns = numeric_features + ordinal_encoded_features)
X_train.head()
| Item_Weight | Item_Visibility | Item_MRP | Outlet_Establishment_Year | Outlet_Size | Outlet_Type | Outlet_Identifier | Item_Type | Item_Fat_Content | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.247802 | 0.858650 | 0.486773 | 0.499522 | 3.0 | 1.0 | 7.0 | 6.0 | 1.0 |
| 1 | -0.065395 | 0.157762 | 1.838334 | 1.095443 | 3.0 | 1.0 | 2.0 | 4.0 | 2.0 |
| 2 | -1.050018 | 0.881784 | 0.207711 | 0.737890 | 2.0 | 1.0 | 6.0 | 13.0 | 1.0 |
| 3 | 0.601052 | 1.096025 | -0.134189 | -1.288241 | 0.0 | 1.0 | 1.0 | 3.0 | 1.0 |
| 4 | -0.237382 | -1.126598 | -0.391272 | 0.737890 | 2.0 | 1.0 | 6.0 | 8.0 | 1.0 |
# Baseline model: ordinary least-squares linear regression.
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
LinearRegression()
def get_MAE(model, X, y):
    """Mean absolute error of model's predictions on (X, y)."""
    return mean_absolute_error(y, model.predict(X))

def get_percent(model, X, y):
    """Crude accuracy score: 1 - MAE / mean(y)."""
    return 1 - (get_MAE(model, X, y) / y.mean())

def get_MSE(model, X, y):
    """Mean squared error of model's predictions on (X, y)."""
    return mean_squared_error(y, model.predict(X))

def get_RMSE(model, X, y):
    """Root mean squared error of model's predictions on (X, y)."""
    return np.sqrt(get_MSE(model, X, y))

def get_r2(model, X, y):
    """R^2 (coefficient of determination) of model's predictions on (X, y)."""
    return r2_score(y, model.predict(X))

def print_stats(model, X, y):
    """Print MAE, MSE, RMSE, R^2 and the MAE-based accuracy for the model."""
    print("The MAE for this model is {:.2f}".format(get_MAE(model, X, y)))
    print("The MSE for this model is {:.2f}".format(get_MSE(model, X, y)))
    # Fixed label: this line prints the RMSE (it was mislabeled "RMAE").
    print("The RMSE for this model is {:.2f}".format(get_RMSE(model, X, y)))
    print("The r2 for this model is {:.2f}".format(get_r2(model, X, y)))
    print("The accuracy (1 - MAE / y.mean ) for this model is {:%}".format(get_percent(model, X, y)))
# Evaluate the linear baseline on the validation split.
print_stats(lin_reg,X_valid,y_valid)
The MAE for this model is 918.55 The MSE for this model is 1457870.80 The RMAE for this model is 1207.42 The r2 for this model is 0.50 The accuracy (1 - MAE / y.mean ) for this model is 57.475388%
# Random forest with default hyperparameters as a stronger baseline.
model = RandomForestRegressor(random_state=1)
model.fit(X_train,y_train)
print_stats(model,X_valid,y_valid)
The MAE for this model is 794.56 The MSE for this model is 1291149.74 The RMAE for this model is 1136.29 The r2 for this model is 0.55 The accuracy (1 - MAE / y.mean ) for this model is 63.215830%
# Sweep the forest size from 50 to 300 trees and record validation MAE per size.
results = {}
for n_trees in (50, 100, 150, 200, 250, 300):
    sweep_model = RandomForestRegressor(n_estimators=n_trees, random_state=1)
    sweep_model.fit(X_train, y_train)
    results[n_trees] = get_MAE(sweep_model, X_valid, y_valid)
    print(f"{n_trees}:")
    print_stats(sweep_model, X_valid, y_valid)
50: The MAE for this model is 796.24 The MSE for this model is 1299342.97 The RMAE for this model is 1139.89 The r2 for this model is 0.55 The accuracy (1 - MAE / y.mean ) for this model is 63.137704% 100: The MAE for this model is 794.56 The MSE for this model is 1291149.74 The RMAE for this model is 1136.29 The r2 for this model is 0.55 The accuracy (1 - MAE / y.mean ) for this model is 63.215830% 150: The MAE for this model is 793.81 The MSE for this model is 1288509.92 The RMAE for this model is 1135.13 The r2 for this model is 0.56 The accuracy (1 - MAE / y.mean ) for this model is 63.250524% 200: The MAE for this model is 791.49 The MSE for this model is 1282953.99 The RMAE for this model is 1132.68 The r2 for this model is 0.56 The accuracy (1 - MAE / y.mean ) for this model is 63.357740% 250: The MAE for this model is 792.05 The MSE for this model is 1285482.33 The RMAE for this model is 1133.79 The r2 for this model is 0.56 The accuracy (1 - MAE / y.mean ) for this model is 63.331717% 300: The MAE for this model is 790.93 The MSE for this model is 1280898.93 The RMAE for this model is 1131.77 The r2 for this model is 0.56 The accuracy (1 - MAE / y.mean ) for this model is 63.383495%
# MAE against tree count; invert the y-axis so "better" points upward.
plt.plot(list(results.keys()), list(results.values()))
plt.gca().invert_yaxis()
plt.show()
from xgboost.sklearn import XGBRegressor
# Gradient-boosted trees: many shallow-learning-rate rounds with early stopping
# on the validation set.
# NOTE(review): passing early_stopping_rounds to the constructor requires
# xgboost >= 1.6; on older versions it belongs in fit() instead — confirm the
# pinned xgboost version.
model2 = XGBRegressor(n_estimators = 2000,
learning_rate=0.005,
early_stopping_rounds = 10,
random_state= 1)
model2.fit(X_train,y_train,
eval_set= [(X_valid, y_valid)],
verbose = False)
print_stats(model2,X_valid,y_valid)
[21:46:42] WARNING: /workspace/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror. The MAE for this model is 759.09 The MSE for this model is 1163752.02 The RMAE for this model is 1078.77 The r2 for this model is 0.60 The accuracy (1 - MAE / y.mean ) for this model is 64.857953%
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Dense regression net: two 256-unit ReLU layers, each preceded by batch
# normalization and followed by 30% dropout, ending in a single linear output.
model3 = keras.Sequential()
model3.add(layers.BatchNormalization(input_shape=[X_train.shape[1]]))
model3.add(layers.Dense(256, activation="relu"))
model3.add(layers.Dropout(0.3))
model3.add(layers.BatchNormalization())
model3.add(layers.Dense(256, activation="relu"))
model3.add(layers.Dropout(0.3))
model3.add(layers.Dense(1))

# Stop when validation loss stops improving by >= 0.001 for 20 epochs,
# restoring the best weights seen.
early_stopping = callbacks.EarlyStopping(min_delta=0.001,
                                         patience=20,
                                         restore_best_weights=True)

model3.compile(optimizer="adam", loss="mse", metrics=["mae"])

history = model3.fit(X_train, y_train,
                     validation_data=(X_valid, y_valid),
                     batch_size=128,
                     callbacks=[early_stopping],
                     epochs=100,
                     verbose=0)

# Track validation MAE over epochs, then report the usual metric set.
history_df = pd.DataFrame(history.history)
history_df["val_mae"].plot()
print_stats(model3, X_valid, y_valid)
The MAE for this model is 752.62 The MSE for this model is 1143255.94 The RMAE for this model is 1069.23 The r2 for this model is 0.61 The accuracy (1 - MAE / y.mean ) for this model is 65.157060%
from sklearn.model_selection import GridSearchCV
# Exhaustive 5-fold search over forest size, feature sampling and leaf weighting
# (4 x 3 x 3 = 36 candidates).
param_grid = {
"n_estimators": [x for x in range(100,251,50)],
"max_features": [1.0,"sqrt","log2"],
"min_weight_fraction_leaf": [0.0, 0.1, 0.01],
}
# NOTE(review): the estimator has no random_state, unlike the earlier forests,
# so best_params_ can vary between runs — consider RandomForestRegressor(random_state=1).
grid_search = GridSearchCV( estimator = RandomForestRegressor(),
param_grid = param_grid,
scoring = "neg_mean_squared_error",
n_jobs = 4,
verbose = 2)
grid_search.fit(X_train, y_train)
Fitting 5 folds for each of 36 candidates, totalling 180 fits
GridSearchCV(estimator=RandomForestRegressor(), n_jobs=4,
param_grid={'max_features': [1.0, 'sqrt', 'log2'],
'min_weight_fraction_leaf': [0.0, 0.1, 0.01],
'n_estimators': [100, 150, 200, 250]},
scoring='neg_mean_squared_error', verbose=2)
# Validation metrics for the tuned forest, then the winning hyperparameters.
print_stats(grid_search,X_valid,y_valid)
grid_search.best_params_
The MAE for this model is 760.87 The MSE for this model is 1168898.91 The RMAE for this model is 1081.16 The r2 for this model is 0.60 The accuracy (1 - MAE / y.mean ) for this model is 64.775505%
{'max_features': 1.0, 'min_weight_fraction_leaf': 0.01, 'n_estimators': 150}
# Take model3, the dense neural network, and try to visualize its performance
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Scatter each validation prediction against its true value; points on the
# diagonal reference line would be perfect predictions.
predictions = model3.predict(X_valid).flatten()
actual = y_valid.to_numpy().flatten()
diagonal = list(range(1, 7000))
trace1 = go.Scatter(x=actual, y=predictions, mode="markers", name="Predicted vs Actual")
trace2 = go.Scatter(x=diagonal, y=diagonal, name="Target Prediction Line")
fig = make_subplots(x_title="Actual Sales", y_title="Predicted Sales",
                    subplot_titles=["Predicted vs Actual Sale of Predictive Model"])
fig.add_trace(trace1)
fig.add_trace(trace2)
py.iplot(fig)
# fig.show("png")